Learning Goals

Lab Description

We will work with two Starbucks datasets, one on the store locations (global) and one for the nutritional data for their food and drink items. We will do some text analysis of the menu items.

Steps

0. Install and load libraries

1. Read in the data

  • There are 4 datasets to read in, Starbucks locations, Starbucks nutrition, US population by state, and US state abbreviations.
sb_locs <- read_csv("starbucks-locations.csv")
## Rows: 25600 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): Brand, Store Number, Store Name, Ownership Type, Street Address, C...
## dbl  (2): Longitude, Latitude
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sb_nutr <- read_csv("starbucks-menu-nutrition.csv")
## Rows: 205 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Item, Category
## dbl (5): Calories, Fat (g), Carb. (g), Fiber (g), Protein (g)
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
usa_pop <- read_csv("us_state_pop.csv")
## Rows: 55 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): state
## dbl (1): population
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
usa_states<-read_csv("states.csv")
## Rows: 51 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): State, Abbreviation
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

2. Look at the data

  • Inspect each dataset to look at variable names and ensure it was imported correctly
head(sb_locs)
## # A tibble: 6 × 13
##   Brand     `Store Number` `Store Name`  `Ownership Type` `Street Address` City 
##   <chr>     <chr>          <chr>         <chr>            <chr>            <chr>
## 1 Starbucks 47370-257954   Meritxell, 96 Licensed         Av. Meritxell, … Ando…
## 2 Starbucks 22331-212325   Ajman Drive … Licensed         1 Street 69, Al… Ajman
## 3 Starbucks 47089-256771   Dana Mall     Licensed         Sheikh Khalifa … Ajman
## 4 Starbucks 22126-218024   Twofour 54    Licensed         Al Salam Street  Abu …
## 5 Starbucks 17127-178586   Al Ain Tower  Licensed         Khaldiya Area, … Abu …
## 6 Starbucks 17688-182164   Dalma Mall, … Licensed         Dalma Mall, Mus… Abu …
## # ℹ 7 more variables: `State/Province` <chr>, Country <chr>, Postcode <chr>,
## #   `Phone Number` <chr>, Timezone <chr>, Longitude <dbl>, Latitude <dbl>
str(sb_locs)
## spc_tbl_ [25,600 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Brand         : chr [1:25600] "Starbucks" "Starbucks" "Starbucks" "Starbucks" ...
##  $ Store Number  : chr [1:25600] "47370-257954" "22331-212325" "47089-256771" "22126-218024" ...
##  $ Store Name    : chr [1:25600] "Meritxell, 96" "Ajman Drive Thru" "Dana Mall" "Twofour 54" ...
##  $ Ownership Type: chr [1:25600] "Licensed" "Licensed" "Licensed" "Licensed" ...
##  $ Street Address: chr [1:25600] "Av. Meritxell, 96" "1 Street 69, Al Jarf" "Sheikh Khalifa Bin Zayed St." "Al Salam Street" ...
##  $ City          : chr [1:25600] "Andorra la Vella" "Ajman" "Ajman" "Abu Dhabi" ...
##  $ State/Province: chr [1:25600] "7" "AJ" "AJ" "AZ" ...
##  $ Country       : chr [1:25600] "AD" "AE" "AE" "AE" ...
##  $ Postcode      : chr [1:25600] "AD500" NA NA NA ...
##  $ Phone Number  : chr [1:25600] "376818720" NA NA NA ...
##  $ Timezone      : chr [1:25600] "GMT+1:00 Europe/Andorra" "GMT+04:00 Asia/Dubai" "GMT+04:00 Asia/Dubai" "GMT+04:00 Asia/Dubai" ...
##  $ Longitude     : num [1:25600] 1.53 55.47 55.47 54.38 54.54 ...
##  $ Latitude      : num [1:25600] 42.5 25.4 25.4 24.5 24.5 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Brand = col_character(),
##   ..   `Store Number` = col_character(),
##   ..   `Store Name` = col_character(),
##   ..   `Ownership Type` = col_character(),
##   ..   `Street Address` = col_character(),
##   ..   City = col_character(),
##   ..   `State/Province` = col_character(),
##   ..   Country = col_character(),
##   ..   Postcode = col_character(),
##   ..   `Phone Number` = col_character(),
##   ..   Timezone = col_character(),
##   ..   Longitude = col_double(),
##   ..   Latitude = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
distinct(sb_locs, Country) |> pull(Country)
##  [1] "AD" "AE" "AR" "AT" "AU" "AW" "AZ" "BE" "BG" "BH" "BN" "BO" "BR" "BS" "CA"
## [16] "CH" "CL" "CN" "CO" "CR" "CW" "CY" "CZ" "DE" "DK" "EG" "ES" "FI" "FR" "GB"
## [31] "GR" "GT" "HU" "ID" "IE" "IN" "JO" "JP" "KH" "KR" "KW" "KZ" "LB" "LU" "MA"
## [46] "MC" "MX" "MY" "NL" "NO" "NZ" "OM" "PA" "PE" "PH" "PL" "PR" "PT" "QA" "RO"
## [61] "RU" "SA" "SE" "SG" "SK" "SV" "TH" "TR" "TT" "TW" "US" "VN" "ZA"
head(sb_nutr)
## # A tibble: 6 × 7
##   Item         Category Calories `Fat (g)` `Carb. (g)` `Fiber (g)` `Protein (g)`
##   <chr>        <chr>       <dbl>     <dbl>       <dbl>       <dbl>         <dbl>
## 1 Chonga Bagel Food          300         5          50           3            12
## 2 8-Grain Roll Food          380         6          70           7            10
## 3 Almond Croi… Food          410        22          45           3            10
## 4 Apple Fritt… Food          460        23          56           2             7
## 5 Banana Nut … Food          420        22          52           2             6
## 6 Blueberry M… Food          380        16          53           1             6
str(sb_nutr)
## spc_tbl_ [205 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Item       : chr [1:205] "Chonga Bagel" "8-Grain Roll" "Almond Croissant" "Apple Fritter" ...
##  $ Category   : chr [1:205] "Food" "Food" "Food" "Food" ...
##  $ Calories   : num [1:205] 300 380 410 460 420 380 420 240 350 320 ...
##  $ Fat (g)    : num [1:205] 5 6 22 23 22 16 17 12 22 16 ...
##  $ Carb. (g)  : num [1:205] 50 70 45 56 52 53 61 28 38 36 ...
##  $ Fiber (g)  : num [1:205] 3 7 3 2 2 1 2 1 0 1 ...
##  $ Protein (g): num [1:205] 12 10 10 7 6 6 5 5 2 8 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Item = col_character(),
##   ..   Category = col_character(),
##   ..   Calories = col_double(),
##   ..   `Fat (g)` = col_double(),
##   ..   `Carb. (g)` = col_double(),
##   ..   `Fiber (g)` = col_double(),
##   ..   `Protein (g)` = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
head(usa_pop)
## # A tibble: 6 × 2
##   state      population
##   <chr>           <dbl>
## 1 Alabama       4779736
## 2 Alaska         710231
## 3 Arizona       6392017
## 4 Arkansas      2915918
## 5 California   37253956
## 6 Colorado      5029196
str(usa_pop)
## spc_tbl_ [55 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ state     : chr [1:55] "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ population: num [1:55] 4779736 710231 6392017 2915918 37253956 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   state = col_character(),
##   ..   population = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
head(usa_states)
## # A tibble: 6 × 2
##   State      Abbreviation
##   <chr>      <chr>       
## 1 Alabama    AL          
## 2 Alaska     AK          
## 3 Arizona    AZ          
## 4 Arkansas   AR          
## 5 California CA          
## 6 Colorado   CO
str(usa_states)
## spc_tbl_ [51 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ State       : chr [1:51] "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ Abbreviation: chr [1:51] "AL" "AK" "AZ" "AR" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   State = col_character(),
##   ..   Abbreviation = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

3. Format the data

  • Subset Starbucks data to the US.
  • Create counts of Starbucks stores by state.
  • Merge population in with the store count by state.
  • Inspect the range values for each variable.
sb_usa <- sb_locs |> filter(Country == "US")
sb_locs_state <- sb_usa |>
  group_by(`State/Province`) |>
  rename(Abbreviation = `State/Province`) |>
  summarize(n_stores = n())

# need state abbreviations
usa_pop_abbr <- usa_states |> 
  full_join(usa_pop, 
            by = c("State" = "state")) 

sb_locs_state <- full_join(sb_locs_state, usa_pop_abbr, by = "Abbreviation")
sb_locs_state
## # A tibble: 55 × 4
##    Abbreviation n_stores State                population
##    <chr>           <int> <chr>                     <dbl>
##  1 AK                 49 Alaska                   710231
##  2 AL                 85 Alabama                 4779736
##  3 AR                 55 Arkansas                2915918
##  4 AZ                488 Arizona                 6392017
##  5 CA               2821 California             37253956
##  6 CO                481 Colorado                5029196
##  7 CT                123 Connecticut             3574097
##  8 DC                 91 District of Columbia     601723
##  9 DE                 25 Delaware                 897934
## 10 FL                694 Florida                18801310
## # ℹ 45 more rows
summary(sb_locs_state)
##  Abbreviation          n_stores         State             population      
##  Length:55          Min.   :   8.0   Length:55          Min.   :   56882  
##  Class :character   1st Qu.:  56.5   Class :character   1st Qu.: 1344331  
##  Mode  :character   Median : 123.0   Mode  :character   Median : 3751351  
##                     Mean   : 266.8                      Mean   : 5677621  
##                     3rd Qu.: 332.0                      3rd Qu.: 6515716  
##                     Max.   :2821.0                      Max.   :37253956  
##                     NA's   :4

4. Use ggplotly for EDA

Answer the following questions:

  • Are the number of Starbucks proportional to the population of a state? (scatterplot)

  • Is the caloric distribution of Starbucks menu items different for drinks and food? (histogram)

  • What are the top 20 words in Starbucks menu items? (bar plot)

p1 <- ggplot(sb_locs_state, aes(x = population, y = n_stores)) +
  geom_point() +
  theme_minimal() +
  labs(x = "Population", y = "Number of Starbucks Stores", title = "Starbucks Stores vs. State Population") +
  geom_smooth()

ggplotly(p1)
## Warning: Removed 4 rows containing non-finite values (`stat_smooth()`).

The number of Starbucks is possitively proportional to the population of a state.

p2 <- ggplot(sb_nutr, aes(x = Calories, fill = Category)) +
  geom_histogram(position = "identity", alpha = 0.8, bins = 30) +
  theme_minimal() +
  labs(x = "Calories", y = "n_stores", title = "Caloric Distribution of Starbucks Menu Items")

ggplotly(p2)

The caloric distribution of Starbucks menu items is different for drinks and food. Food tends to have higher calories.

p3<- sb_nutr |>
  unnest_tokens(word, Item) |>
  count(word, sort = TRUE) |>
  head(20) |>
  ggplot(aes(x = reorder(word, n), y = n))+
  geom_col()+
  coord_flip()+
  labs(x = "Frequency", y = "Word", title = "Top 20 Words in Starbucks Menu Items")

ggplotly(p3)

5. Scatterplots using plot_ly()

  • Create a scatterplot using plot_ly() representing the relationship between calories and carbs
  • Color points by category
cal_carbs_plot <- sb_nutr |>
  plot_ly(x = ~Calories, y = ~`Carb. (g)`,
          type = 'scatter', 
          mode = 'markers', 
          color = ~Category,
          marker = list(sizemode='diameter')) |>
  layout(title = "Relationship between Calories and Carbs",
         xaxis = list(title = "Calories"),
         yaxis = list(title = "Carbs"))

cal_carbs_plot
  • Create this scatterplot but for the items consisting of the top 10 words
  • Color again by category
  • Add hoverinfo specifying the word in the item name
  • Add layout information to title the chart and the axes
  • Enable hovermode = "compare"
top_10 <- sb_nutr |>
  unnest_tokens(word, Item) |>
  count(word, sort = TRUE) |>
  head(10) |>
  select(word) |>
  unlist()

sb_nutr_top_words <- sb_nutr |>
  mutate(contains_word = str_detect(Item, regex(paste(top_10, collapse = "|"), ignore_case = TRUE))) |>
  filter(contains_word == TRUE)
cal_carb_top_words <- sb_nutr_top_words |>
  plot_ly(x = ~Calories, y = ~`Carb. (g)`,
          type = 'scatter',
          mode = 'markers',
          color = ~Category,
          hoverinfo = 'text',
          text = ~paste("Item:", Item),
          marker = list(sizemode='diameter')) |>
  layout(title = "Calories vs Carbs for Items with Top 10 Words ",
  xaxis = list(title = "Calories"),
  yaxis = list(title = "Carbs"),
  hovermode = "compare")

cal_carb_top_words

6. plot_ly Boxplots

  • Create a boxplot of all of the nutritional variables in groups by the 10 item words.
identify_top_words <- function(item_name, top_words) {
  contained_words <- top_words[sapply(top_words, function(word) str_detect(item_name, regex(word, ignore_case = TRUE)))]
  paste(contained_words, collapse = ", ")
}
with_words <- sb_nutr_top_words |>
  mutate(TopWordsContained = sapply(Item, identify_top_words, top_words = top_10)) |>
  separate_rows(TopWordsContained, sep = ",\\s*")
with_words
with_words_long <- with_words |>
  pivot_longer(
    cols = c(`Fat (g)`, `Carb. (g)`, `Fiber (g)`, `Protein (g)`), 
    names_to = "Nutrient", 
    values_to = "Value")

plot_ly(data = with_words_long, 
        x = ~TopWordsContained, 
        y = ~Value, type = 'box',
        color = ~Nutrient) |>
  layout(yaxis = list(title = 'Nutritional Value'), 
         xaxis = list(title = 'Top Words'), 
         title = 'Nutritional Content by Top Words')

7. 3D Scatterplot

  • Create a 3D scatterplot between Calories, Carbs, and Protein for the items containing the top 10 words
  • Do you see any patterns?
sb_nutr_top_words |>
  plot_ly(x = ~Calories, y = ~`Carb. (g)`, z = ~`Protein (g)`,
          type = 'scatter3d', mode = 'markers',
          sizes = c(5, 70),
          marker = list(sizemode='area', opacity=0.5)) |>
  layout(title = '3D Scatterplot of Nutrition Facts for Top Word Items',
         scene = list(xaxis = list(title = 'Calories'),
                      yaxis = list(title = 'Carbs'),
                      zaxis = list(title = 'Protein')))

Higher calories come with higher carbs and higher protein. The three variables are positively correlated.

8. plot_ly Map

  • Create a map to visualize the number of stores per state, and another for the population by state. Use subplot to put the maps side by side.
  • Describe the differences if any.
# Set up mapping details
set_map_details <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('steelblue')
)

# Make sure both maps are on the same color scale
shadeLimit <- 125

# Create hover text
sb_locs_state$hover <- with(sb_locs_state, paste("Number of Starbucks: ", n_stores, '<br>', "State: ", Abbreviation, '<br>', "Population: ", population))

usa_pop_abbr$hover <- with(usa_pop_abbr, paste("State: ", Abbreviation, '<br>', "Population: ", population))


# Create the map
map1 <- plot_geo(sb_locs_state, locationmode = 'USA-states') |>
  add_trace(
    z = ~n_stores, locations = ~Abbreviation,
    hoverinfo = "text",
    text = ~hover,
    colors = 'Purples', 
    marker = list(line = list(color = 'rgb(255,255,255)', width = 2))
  ) |>
  colorbar(title = "Stores") |>
  layout(title = 'Starbucks Stores per State', geo = set_map_details)

map1
map2 <- plot_geo(usa_pop_abbr, locationmode = 'USA-states') |>
  add_trace(
    z = ~population, locations = ~Abbreviation,
    hoverinfo = "text",
    text = ~hover,
    colors = 'Greens', 
    marker = list(line = list(color = 'rgb(255,255,255)', width = 2))
  ) |>
  colorbar(title = "Population") |>
  layout(title = 'Population by State', geo = set_map_details)

map2
subplot(map1, map2)